# cores to use for multiprocessing: is the number of cores is small -> we are not on a cluster -> use all; if the number of cores is large -> we are on cluster -> use only 15 or 7 or 31 cores (n requested-1)
if(detectCores() <= 4) cores_to_use = detectCores() - 1
if(detectCores() > 4) cores_to_use = 6
cores_to_use = NULL
# how many permutations?
N_permut = 1000
filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","",Sys.Date()),".RData")
filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","","2018-08-17"),".RData")
file_all = "./processed_data_files/human_net_w_domains"
gunzip(paste0(file_all,".gz"), remove = F)
data_all = fread(file_all, sep = "\t", stringsAsFactors = F)
data_all = data_all[IDs_interactor_human_B != "UNKNOWN"]
unlink(file_all)
file_BioPlex3 = "./processed_data_files/BioPlex3human_net_w_domains"
gunzip(paste0(file_BioPlex3,".gz"), remove = F)
data_BioPlex3 = fread(file_BioPlex3, sep = "\t", stringsAsFactors = F)
data_BioPlex3 = data_BioPlex3[IDs_interactor_human_B != "UNKNOWN"]
unlink(file_BioPlex3)
# count function: set up standard parameters
permuteCount = function(data, select_nodes = NULL, also_permuteYZ = F, N_permut = 10){
permutationPval(interactions2permute = IDs_interactor_human_A ~ IDs_interactor_human_B, # first set of interacting pairs (XY) that are to be permuted
associations2test = IDs_interactor_human_A ~ IDs_domain_human_B, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
node_attr = list(IDs_interactor_human_A ~ IDs_interactor_human_A_degree,
IDs_domain_human_B ~ domain_count,
IDs_interactor_human_A + IDs_domain_human_B ~ domain_frequency_per_IDs_interactor_human_A),
data = data,
statistic = IDs_interactor_human_A + IDs_domain_human_B ~ .N,
select_nodes = select_nodes,
N = N_permut,
cores = NULL, seed = 2, also_permuteYZ = F,
clustermq = T, clustermq_mem = 12500,
split_comp_inner_N = 3, clustermq_jobs = 60,
clustermq_log_worker = F)
}
# count: all protein interactions in IntAct and all domains - # permute IDs_interactor_human_A ~ IDs_interactor_human_B
load(filename)
if(!"res_count_all" %in% objects()){
time = proc.time()
res_count_all = permuteCount(data_all, select_nodes = IDs_domain_human_B ~ domain_count >= 1,
N_permut = 1800)
proc.time() - time
# save results
save(list = ls(), file=filename)
}
plot(res_count_all, main = "count: all proteins and domains")
# count: protein interactions in BioPlex3 and all domains - # permute IDs_interactor_human_A ~ IDs_interactor_human_B
if(!"res_count_all" %in% objects()){
time = proc.time()
res_count_BioPlex3 = permuteCount(data_BioPlex3, select_nodes = IDs_domain_human_B ~ domain_count >= 1,
N_permut = 40)
proc.time() - time
# save results
save(list = ls(), file=filename)
}
plot(res_count_BioPlex3, main = "count: all proteins and domains (BioPlex)")
# count
PermutResult2D(res = res_count_all, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring human protein - domain pairs, \n statistic: count of a domain among interacting partners of a protein (IntAct)")
# Fisher test p-value
PermutResult2D(res = res_count_BioPlex3, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring human protein - domain pairs, \n statistic: count of a domain among interacting partners of a protein (BioPlex3)")
if(!file.exists("./data_files/interactiondomains.tsv")) downloader::download("http://elm.eu.org/interactiondomains.tsv","./data_files/interactiondomains.tsv")
interactiondomains = fread("./data_files/interactiondomains.tsv")
interactiondomains[, pfam_id := `Interaction Domain Id`]
domains_known = interactiondomains[, unique(pfam_id)]
InterProScan_domains = readInterProGFF3("../viral_project/processed_data_files/all_human_viral_protein_domains.gff3.gz")
# get InterProID to member database ID mapping
InterPro2memberDB = getInterPro2memberDB(InterProScan_domains)
InterPro2memberDB = InterPro2memberDB[complete.cases(InterPro2memberDB)]
domains_known_mapped = unique(InterPro2memberDB[memberDBID %in% domains_known | InterProID %in% domains_known, InterProID])
domains_not_mapped = unique(domains_known[!(domains_known %in% InterPro2memberDB$memberDBID | domains_known %in% InterPro2memberDB$InterProID)])
I did Fisher test to evaluate if the domains that we find are enriched in domains known to interact with linear motifs (from ELM). I have picked most likely human protein - human domain associations (by p-value). Then I counted how many known motif-binding domains we have found and did the Fisher test. Finally, I was choosing different p-value cutoffs.
testEnrichment = function(N, res, rank_by = "p.value", domains_known_mapped, random = F, name = "", decreasing = F){
if(random) {
res$data_pval = unique(res$data_with_pval[,c("IDs_interactor_human_A", "IDs_domain_human_B", rank_by, "domain_type", "domain_count", "IDs_interactor_human_A_degree"), with = F])
domains_found = res$data_pval[sample(1:nrow(res$data_with_pval), N), unique(IDs_domain_human_B)]
} else {
res$data_pval = unique(res$data_with_pval[,c("IDs_interactor_human_A", "IDs_domain_human_B", rank_by, "domain_type", "domain_count", "IDs_interactor_human_A_degree"), with = F])
ind = order(unlist(res$data_pval[, c(rank_by), with = F]), decreasing = decreasing)[1:N]
domains_found = res$data_pval[ind, unique(IDs_domain_human_B)]
}
alldomains = res$data_with_pval[, unique(IDs_domain_human_B)]
known = factor(alldomains %in% domains_known_mapped, levels = c("TRUE", "FALSE"))
found = factor(alldomains %in% domains_found, levels = c("TRUE", "FALSE"))
table_res = table(known, found)
test = fisher.test(table(known, found), alternative = "greater", conf.int = T)
return(c(pval = test$p.value, odds_ratio = as.vector(test$estimate),
count = table_res["TRUE", "TRUE"], total_count = sum(as.logical(found)),
name = name))
}
runningTestEnrichment = function(res, name, rank_by = "p.value", decreasing = F, Ns){
enrichment = sapply(Ns, testEnrichment, res = res,
domains_known_mapped = domains_known_mapped,
name = name, rank_by = rank_by, decreasing = decreasing)
colnames(enrichment) = Ns
return(enrichment)
}
Ns = seq(25, 1000, 40)
# count
enrichment = runningTestEnrichment(res_count_all, name = "domain frequency among interactors of a human protein, empirical pval", Ns = Ns)
enrichment_justfreq = runningTestEnrichment(res_count_all, rank_by = "observed_statistic", name = "domain count among interactors of a human protein, frequency", Ns = Ns)
enrichmentBioPlex3 = runningTestEnrichment(res_count_BioPlex3, name = "domain frequency among interactors of a human protein (BioPlex3), empirical pval", Ns = Ns)
random_domains = function(N = 100, seed = seed, Ns = seq(25, 500, 25), res){
set.seed(seed)
quantiles = c(0.975, 0.75, 0.5, 0.25, 0.025)
quantile_names = c("97.5% quantile", "75% quantile", "median", "25% quantile", "2.5% quantile")
all_temp = Q(fun = function(i) {
library(data.table)
enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[1:3,]
rownames = rownames(enrichmentRANDOM)
enrichmentRANDOM = matrix(as.numeric(unlist(enrichmentRANDOM)),nr=nrow(enrichmentRANDOM))
colnames(enrichmentRANDOM) = Ns
rownames(enrichmentRANDOM) = rownames
enrichmentRANDOM
}, 1:N,
export = list(Ns = Ns, testEnrichment = testEnrichment,
res = res, domains_known_mapped = domains_known_mapped),
seed = seed, memory = 1000, n_jobs = 20, rettype = "list")
#pval_temp = replicate(N, {
# enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[1,]
# names(enrichmentRANDOM) = Ns
# as.numeric(enrichmentRANDOM)
#})
pval_temp = sapply(all_temp, function(m) m[1,])
pval = apply(pval_temp, 1, quantile, probs = quantiles)
rownames(pval) = quantile_names
colnames(pval) = Ns
#odds_ratio_temp = replicate(N, {
# enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[2,]
# names(enrichmentRANDOM) = Ns
# as.numeric(enrichmentRANDOM)
#})
odds_ratio_temp = sapply(all_temp, function(m) m[2,])
odds_ratio = apply(odds_ratio_temp, 1, quantile, probs = quantiles)
rownames(odds_ratio) = quantile_names
colnames(odds_ratio) = Ns
#count_temp = replicate(N, {
# enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[3,]
# names(enrichmentRANDOM) = Ns
# as.numeric(enrichmentRANDOM)
#})
count_temp = sapply(all_temp, function(m) m[3,])
count = apply(count_temp, 1, quantile, probs = quantiles)
rownames(count) = quantile_names
colnames(count) = Ns
return(list(pval = pval, odds_ratio = odds_ratio, count = count))
}
if(!"enrichmentRANDOM" %in% objects()){
enrichmentRANDOM = random_domains(1000, 1, res = res_count_all)
# save results
save(list = ls(), file=filename)
}
## Connecting ebi via SSH ...
## Sending common data ...
## Submitting 20 worker jobs (ID: 6531) ...
## Running 1,000 calculations (1 calls/chunk) ...
## Master: [437.7s 0.8% CPU]; Worker: [avg 99.1% CPU, max 568.4 Mb]
As we include more proteins, the number of known domains we find increases and then levels off (probably because some of the known SLIM-binding domains do not interact with viral proteins). ????? is this true for human? should always increase
plotEnrichment(enrichment, enrichment_justfreq, enrichmentBioPlex3,
random_domains = enrichmentRANDOM,
domains_known_mapped = domains_known_mapped, type = "count", plot_type = "l")
plotEnrichment(enrichment, enrichment_justfreq, enrichmentBioPlex3,
random_domains = enrichmentRANDOM,
domains_known_mapped = domains_known_mapped, type = "odds_ratio", plot_type = "l")
plotEnrichment(enrichment, enrichment_justfreq, enrichmentBioPlex3,
random_domains = enrichmentRANDOM,
domains_known_mapped = domains_known_mapped, type = "pval", plot_type = "l")
selectTopHits = function(res, N){
res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_human_A, IDs_domain_human_B, p.value, domain_type, domain_count, IDs_interactor_human_A_degree)])
pairs200pval = res$data_pval[order(p.value, decreasing = F)[1:N], max(p.value)]
restop100 = res
restop100$data_with_pval = restop100$data_with_pval[p.value <= pairs200pval, ]
restop100$data_with_pval[, N_human_per_human_w_domain := length(unique(IDs_interactor_human_A)), by = .(IDs_interactor_human_B, IDs_domain_human_B)]
return(restop100)
}
restop100 = selectTopHits(res_count_all, N = 250)
plot(restop100, IDs_interactor_human_B ~ N_human_per_human_w_domain)
plot(restop100)
1122 human proteins with enriched domains have 5 or more human interacting partners.
476 human proteins with enriched domains have 10 or more human interacting partners.
restop100$data_with_pval[N_human_per_human_w_domain >= 10, unique(IDs_domain_human_B)]
## [1] "IPR013083" "IPR013087" "IPR000608" "IPR016135" "IPR027417"
## [6] "IPR001766" "IPR011991" "IPR000504" "IPR000980" "IPR001452"
## [11] "IPR001811" "IPR000719" "IPR011009" "IPR001841" "IPR011993"
## [16] "IPR003309" "IPR008916" "IPR032444" "IPR029071" "IPR001245"
## [21] "IPR020635" "IPR033899" "IPR004827" "IPR013783" "IPR027409"
## [26] "IPR027413" "IPR023410" "IPR003599" "IPR007110" "IPR013151"
## [31] "IPR015943" "IPR017986"
restop100$data_with_pval[N_human_per_human_w_domain >= 10, unique(IDs_domain_human_B)] %in% domains_known_mapped
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [12] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
restop100$data_with_pval[N_human_per_human_w_domain >= 10 & IDs_domain_human_B == "IPR000504", unique(IDs_interactor_human_B)]
## [1] "O00425" "O14979" "O43390" "O60506" "O60812" "O75494" "O95758"
## [8] "P05455" "P07910" "P08621" "P09012" "P09651" "P11940" "P14866"
## [15] "P19338" "P22626" "P23246" "P26368" "P26599" "P31483" "P31942"
## [22] "P31943" "P35637" "P38159" "P43243" "P51991" "P52272" "P52597"
## [29] "P55795" "P62995" "P84103" "P98175" "P98179" "Q01081" "Q01130"
## [36] "Q01844" "Q07955" "Q08170" "Q13151" "Q13242" "Q13243" "Q13247"
## [43] "Q13283" "Q13310" "Q13595" "Q14011" "Q14103" "Q14498" "Q15233"
## [50] "Q15287" "Q15415" "Q15427" "Q15717" "Q16629" "Q16630" "Q86V81"
## [57] "Q96PK6" "Q99729" "Q9BWF3" "Q9NR30" "Q9NZI8" "Q9UBU9" "Q9UHX1"
## [64] "Q9UKA9" "Q9UKM9" "Q9UKV3" "Q9Y5S9"
DT::datatable(restop100$data_with_pval[N_human_per_human_w_domain >= 10,])
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","",Sys.Date()),".RData")
filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","","2017-10-19"),".RData")
save(list = ls(), file=filename)
#R.utils::gzip(filename = "./processed_data_files/predict_domain_human_clust11092017.RData",
# destname = "./processed_data_files/predict_domain_human_clust11092017.RData.gz",
# remove = T, overwrite = T)
Sys.Date()
## [1] "2018-08-17"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] clustermq_0.8.4.1 R.utils_2.6.0 R.oo_1.22.0
## [4] R.methodsS3_1.7.1 RColorBrewer_1.1-2 GGally_1.4.0
## [7] ggplot2_3.0.0 rtracklayer_1.40.4 GenomicRanges_1.32.6
## [10] GenomeInfoDb_1.16.0 MItools_0.1.38 Biostrings_2.48.0
## [13] XVector_0.20.0 data.table_1.11.4 PSICQUIC_1.18.1
## [16] plyr_1.8.4 httr_1.3.1 biomaRt_2.36.1
## [19] IRanges_2.14.10 S4Vectors_0.18.3 BiocGenerics_0.26.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 matrixStats_0.54.0
## [3] bit64_0.9-7 progress_1.2.0
## [5] rprojroot_1.3-2 tools_3.5.1
## [7] backports_1.1.2 R6_2.2.2
## [9] DT_0.4 KernSmooth_2.23-15
## [11] DBI_1.0.0 lazyeval_0.2.1
## [13] colorspace_1.3-2 withr_2.1.2
## [15] tidyselect_0.2.4 prettyunits_1.0.2
## [17] ontologyIndex_2.4 bit_1.1-14
## [19] compiler_3.5.1 Biobase_2.40.0
## [21] DelayedArray_0.6.4 caTools_1.17.1.1
## [23] scales_1.0.0 stringr_1.3.1
## [25] digest_0.6.15 Rsamtools_1.32.2
## [27] rmarkdown_1.10 pkgconfig_2.0.1
## [29] htmltools_0.3.6 htmlwidgets_1.2
## [31] rlang_0.2.1 RSQLite_2.1.1
## [33] shiny_1.1.0 bindr_0.1.1
## [35] jsonlite_1.5 crosstalk_1.0.0
## [37] BiocParallel_1.14.2 gtools_3.8.1
## [39] dplyr_0.7.6 RCurl_1.95-4.11
## [41] magrittr_1.5 GenomeInfoDbData_1.1.0
## [43] Matrix_1.2-14 Rcpp_0.12.18
## [45] munsell_0.5.0 proto_1.0.0
## [47] stringi_1.2.4 yaml_2.2.0
## [49] SummarizedExperiment_1.10.1 zlibbioc_1.26.0
## [51] gplots_3.0.1 qvalue_2.12.0
## [53] grid_3.5.1 blob_1.1.1
## [55] promises_1.0.1 gdata_2.18.0
## [57] crayon_1.3.4 lattice_0.20-35
## [59] splines_3.5.1 hms_0.4.2
## [61] knitr_1.20 pillar_1.3.0
## [63] reshape2_1.4.3 XML_3.98-1.15
## [65] glue_1.3.0 evaluate_0.11
## [67] downloader_0.4 httpuv_1.4.5
## [69] gtable_0.2.0 purrr_0.2.5
## [71] reshape_0.8.7 assertthat_0.2.0
## [73] gsubfn_0.7 mime_0.5
## [75] xtable_1.8-2 later_0.7.3
## [77] tibble_1.4.2 GenomicAlignments_1.16.0
## [79] AnnotationDbi_1.42.1 memoise_1.1.0
## [81] bindrcpp_0.2.2 rzmq_0.9.3
## [83] ROCR_1.0-7